# Based on:
# https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html

# Install only the dependencies that are missing. Checking first avoids
# re-downloading every package (and requiring network access) on each run.
# limma is listed because the script attaches it with library() below.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
required_packages <- c(
    "limma", "Glimma", "edgeR", "Homo.sapiens", "RColorBrewer", "repr"
)
is_installed <- vapply(
    required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
    BiocManager::install(required_packages[!is_installed])
}

suppressMessages(library(limma))
suppressMessages(library(Glimma))
suppressMessages(library(edgeR))
suppressMessages(library(Homo.sapiens))
suppressMessages(library(RColorBrewer))
suppressMessages(library(repr))

# Write a data frame to a tab-separated file, keeping its row names.
#
# With col.names = NA and row.names = TRUE, write.table() emits a blank header
# cell above the row-name column: the column names stay aligned with the
# column values, but the index/rownames column itself gets no name.
#
# Args:
#   dataframe: Data frame to write.
#   filename:  Path of the output file.
#
# Returns:
#   NULL, invisibly. The function is called for its side effect of writing
#   `filename`, so nothing is printed when it is called at top level.
write_tsv <- function(dataframe, filename) {
    write.table(
        dataframe,
        file = filename,
        sep = "\t",
        col.names = NA,
        row.names = TRUE,
        quote = FALSE
    )
    invisible(NULL)
}

# Input tables. Both are tab-delimited, and the first column of each file is
# used as the row names.
files <- c("Count_tables.txt")
path <- files[[1]]
counts <- read.delim(file = path, row.names = 1)

metadata_file <- "Meta_Table_controlonly.txt"
metadata <- read.delim(file = metadata_file, row.names = 1)

# Look up gene symbols and chromosomes for the count-table row names.
# The IDs are taken from `counts` because the DGEList `x` is only created
# further down the script, so `rownames(x)` is not available at this point.
geneid <- rownames(counts)

# assumes the count-table row names are RefSeq accessions, as implied by
# keytype = "REFSEQ" -- TODO confirm against Count_tables.txt.
genes <- select(
    Homo.sapiens,
    keys = geneid,
    columns = c("SYMBOL", "TXCHROM"),
    keytype = "REFSEQ"
)

# The result has a REFSEQ key column plus the requested SYMBOL and TXCHROM
# columns. A key can map to several rows, so nrow(genes) may exceed
# length(geneid).
# NOTE(review): `genes` is not referenced again in this script; the DGEList is
# built with genes = rownames(counts).
head(genes)

# Join the sample metadata to the (transposed) count data. After t(), each row
# of the count matrix is a sample, so merging on row names pairs every
# sample's metadata with its counts. merge() returns the shared key in a
# column called `Row.names`.
df <- merge(metadata, t(counts), by = "row.names")
if (nrow(df) == 0) {
    stop(
        "No sample names are shared between the metadata and the count table.",
        call. = FALSE
    )
}

# Key the rows by sample ID. Data-frame row names must be unique, and Group is
# used below as a two-level grouping factor, so its values repeat across
# samples and cannot serve as row names.
rownames(df) <- df$Row.names

# There were a couple samples that we previously decided to drop as outliers.
#SMOKER = 'SRR10571728'
#DROPME = 'SRR10971381'

#df = df[df$Row.names != SMOKER & df$Row.names != DROPME, ]

# DGEList.
# The gene columns of `df` are transposed back so that genes are rows and
# samples are columns before being handed to DGEList().
gene_names <- rownames(counts)
feature_names <- colnames(metadata)
x <- DGEList(
    counts = t(df[, gene_names]),
    genes = gene_names,
    # feature_names holds column names, so the columns have to be selected
    # with `[`. df$feature_names would look for a column literally called
    # "feature_names" and pass NULL instead of the metadata. drop = FALSE
    # keeps a data frame even when the metadata has a single column.
    samples = df[, feature_names, drop = FALSE]
)

# Sample grouping: the Group column as a factor. It is stored on the DGEList
# and kept under both `Group` and `group`; `group` is what the design matrix
# is built from.
group <- Group <- as.factor(df$Group)
x$samples$group <- Group

# Counts per million, on the raw scale and on the log2 scale.
# NOTE: the first result is stored under the name `cpm`. The call on the next
# line still reaches the function, because R skips non-function objects when
# it resolves a name in call position.
cpm <- cpm(x)
lcpm <- cpm(x, log = TRUE)

# Mean and median library size, reported in millions.
L <- 1e-6 * mean(x$samples$lib.size)
M <- 1e-6 * median(x$samples$lib.size)

message("Mean library size (millions): ", L)
message("Median library size (millions): ", M)

# EdgeR: design matrix with an intercept and a single group effect.
design <- model.matrix(~ group)
# Two column names are assigned, so `group` must have exactly two levels.
# NOTE(review): assumes the second level of `group` is the COVID-19 group --
# confirm against the metadata.
colnames(design) <- c("intercept", "COVID19")

# TMM normalization factors first, then dispersion estimates for that design.
y <- calcNormFactors(x, method = "TMM")
y <- estimateDisp(y, design)

# Likelihood ratio test on the second design column (the group effect).
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef = 2)

# Top tags.
message('TopTags:')
topTags(lrt)

# Table 1.
# Per-gene LRT statistics plus Benjamini-Hochberg adjusted p-values. The table
# is built before it is summarised: until this assignment the name `table`
# still refers to the base R function, which is not what should be summarised.
padjust_BH <- p.adjust(lrt$table$PValue, method = "BH")
table <- cbind(lrt$table, padjust_BH)

# Per-column summary of the full results table.
summary(table)

write.csv(as.data.frame(table), file = "eLife_DEGs_Mast_script.csv")


# Genes reported in Table 1, looked up by row name in the results table.
table1_genes <- c(
    "A2M", "BDKRB1", "BDKRB2",
    "F13A1", "F13B", "F12", "F11", "F10", "F9", "F8", "F7", "F5", "F3", "F2",
    "FGA", "FGB", "FGG",
    "KLKB1", "KNG1",
    "PLAT", "PLAU", "PLAUR", "PLG",
    "PROC", "PROCR", "PROS1",
    "SERPINA5", "SERPINB2", "SERPINC1", "SERPIND1", "SERPINE1", "SERPINF2",
    "SERPING1",
    "TFPI", "THBD", "VWF"
)
message("Table 1:")
# NOTE(review): indexing by name only matches if the row names of `table` are
# gene symbols; names that are absent come back as all-NA rows. Confirm the ID
# type used in the count table.
table[table1_genes, ]



# END.

